COVID-19

Interpretable Machine Learning course homework no. 3\ Tymoteusz Makowski

Data source

Imports

In [16]:
import pandas as pd
import numpy as np

from datetime import datetime as dt
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from lime.lime_tabular import LimeTabularExplainer
In [2]:
cols = [
    "age",
    "sex",
    "country",
    "date_onset_symptoms",
    "date_admission_hospital",
    "date_confirmation",
    "symptoms",
    "chronic_disease",
    "outcome",
    "date_death_or_discharge"
]

data_raw = pd.read_csv("data/covid19.csv", usecols=cols)
print(data_raw.shape)
data_raw.head(3)
(21241, 10)
Out[2]:
age sex country date_onset_symptoms date_admission_hospital date_confirmation symptoms chronic_disease outcome date_death_or_discharge
0 30 male China 18.01.2020 20.01.2020 22.01.2020 NaN NaN NaN NaN
1 47 male China 10.01.2020 21.01.2020 23.01.2020 NaN NaN NaN NaN
2 49 male China 15.01.2020 20.01.2020 23.01.2020 NaN NaN NaN NaN

Data preparation

Symptoms

In [3]:
symptoms_list = []
for symp in data_raw.symptoms:
    if not pd.isna(symp):
        symptoms_list += [s.strip() for s in symp.split(",")]
        
symptoms_raw = pd.Series(symptoms_list).map(lambda x: x.lower())
symptoms_counts = symptoms_raw.reset_index().rename(columns={0: "symptom"}).groupby("symptom").count().reset_index().rename(columns={"index": "cnt"})
signif_counts = symptoms_counts.query("cnt > 4").reset_index(drop=True)
In [4]:
# Greater than 4 cases.
signif_symptoms_list = [
    "asymptomatic", "bone pain", "chest tightness", "chills", "cough", "coughing", "diarrhea", "discomfort", "dyspnea",
    "fatigue", "fever", "headache", "joint pain", "malaise", "myalgia", "nausea", "phlegm", "pneumonia", "pneumonitis",
    "runny nose", "shortness of breath", "sneezing", "sore throat", "sputum"
]

signif_symptoms = dict(
    [("MuscleSoreness", "soreness"), ("Weakness", "weak")] +\
    [(s.title().replace(" ", ""), s) for s in signif_symptoms_list]
)

Utility functions

In [5]:
def map_outcome(x):
    if pd.isna(x):
        return 0
    return int(x in ["died", "death"])


def map_age(x):
    try:
        x = float(x)
    except ValueError:
        if x[0] == x[-2] and x[1] == "0" and x[-1] == "9":
            x = int(x[:2])
        else:
            return np.nan
        
    return x // 10


def map_sex(x):
    if pd.isna(x):
        return np.nan
    
    return int(x == "male")
    
    
def days_diff(x, y, date_format="%d.%m.%Y"):
    try:
        x = dt.strptime(x, date_format)
        y = dt.strptime(y, date_format)
    except:
        return np.nan
    
    return (x - y).days

Converting data

In [6]:
data = pd.DataFrame(columns=[
    "HasDied",
    "Age",
    "Sex",
    "Country",
    "DaysInHospital",
    "DaysBeforeHospitalization",
    "DaysBeforeConfirmation"
])

country_dict = dict([(c, np.nan if pd.isna(c) else i) for i, c in enumerate(data_raw.country.unique())])

for i in range(data_raw.shape[0]):
    d = {}
    row = data_raw.loc[i, :]
    
    d["HasDied"] = map_outcome(row.outcome)
    d["Age"] = map_age(row.age) # Age i: corresponds to (10 * i, 10 * (i + 1) - 1)
    d["Sex"] = map_sex(row.sex) # 1 - man, 0 - woman
    d["Country"] = country_dict.get(row.country)
    d["DaysInHospital"] = days_diff(row.date_death_or_discharge, row.date_admission_hospital)
    d["DaysBeforeHospitalization"] = days_diff(row.date_admission_hospital, row.date_onset_symptoms)
    d["DaysBeforeConfirmation"] = days_diff(row.date_confirmation, row.date_onset_symptoms)
    d["DaysAfterConfirmation"] = days_diff(row.date_death_or_discharge, row.date_confirmation) # Days as confirmend affected
    
    # Symptoms
    for k, v in signif_symptoms.items():
        d[k] = np.nan if pd.isna(row.symptoms) else int(row.symptoms.find(v) != -1)
    
    data = data.append(d, ignore_index=True)

data.head(5)
Out[6]:
HasDied Age Sex Country DaysInHospital DaysBeforeHospitalization DaysBeforeConfirmation Asymptomatic BonePain ChestTightness ... Nausea Phlegm Pneumonia Pneumonitis RunnyNose ShortnessOfBreath Sneezing SoreThroat Sputum Weakness
0 0.0 3.0 1.0 0.0 NaN 2.0 4.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 0.0 4.0 1.0 0.0 NaN 11.0 13.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 0.0 4.0 1.0 0.0 NaN 5.0 8.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 0.0 4.0 0.0 0.0 NaN 3.0 6.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 0.0 5.0 0.0 0.0 NaN 11.0 13.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 34 columns

Splitting the data

In [7]:
feature_names = data.columns[1:]

# Only records with outcome value not missing.
has_outcome = ~data_raw.outcome.isna()
X_train, X_test, y_train, y_test = train_test_split(
    np.array(data[has_outcome].dropna(thresh=3).iloc[:, 1:]),
    np.array(data[has_outcome].dropna(thresh=3).iloc[:, 0]),
    test_size=0.3,
    random_state=58
)
In [39]:
# Mapping missing values to -9999
X_train[np.isnan(X_train)] = -9999
X_test[np.isnan(X_test)] = -9999

Modelling

In [40]:
model = CatBoostClassifier(random_seed=58)
model.fit(X_train, y_train, verbose=False)
Out[40]:
<catboost.core.CatBoostClassifier at 0x1bff2bfa508>
In [41]:
# The classes are extremely unbalanced, hence its a good thing if the classfier predicted literally anything to be in class 1.
pred_labels = model.predict(X_test)
(pred_labels == 1).sum(), (y_test == 1).sum()
Out[41]:
(4, 3)
In [42]:
# Accuracy
(pred_labels == y_test).sum() / len(y_test)
Out[42]:
0.9791666666666666
In [43]:
# F1 score
f1_score(pred_labels, y_test)
Out[43]:
0.8571428571428571

Explaining an observation (LIME)

Task 2 - Selecting an observation (true positive)

In [44]:
np.where(np.logical_and(pred_labels == 1, y_test == 1))
Out[44]:
(array([27, 31, 43], dtype=int64),)
In [45]:
obs = X_test[43, :]
pd.DataFrame(obs, feature_names, columns=["Value"])
Out[45]:
Value
Age 7.0
Sex 1.0
Country 20.0
DaysInHospital -9999.0
DaysBeforeHospitalization -9999.0
DaysBeforeConfirmation -9999.0
Asymptomatic -9999.0
BonePain -9999.0
ChestTightness -9999.0
Chills -9999.0
Cough -9999.0
Coughing -9999.0
DaysAfterConfirmation 1.0
Diarrhea -9999.0
Discomfort -9999.0
Dyspnea -9999.0
Fatigue -9999.0
Fever -9999.0
Headache -9999.0
JointPain -9999.0
Malaise -9999.0
MuscleSoreness -9999.0
Myalgia -9999.0
Nausea -9999.0
Phlegm -9999.0
Pneumonia -9999.0
Pneumonitis -9999.0
RunnyNose -9999.0
ShortnessOfBreath -9999.0
Sneezing -9999.0
SoreThroat -9999.0
Sputum -9999.0
Weakness -9999.0

Model prediction for the observation

In [46]:
# Model predicts 1 - death of the patient.
model.predict(obs)
Out[46]:
1.0

Task 3 - Model explaination (LIME) for the observation

In [47]:
explainer = LimeTabularExplainer(X_train, class_names=["survival", "death"], feature_names=feature_names, discretize_continuous=False)
explaination = explainer.explain_instance(obs, model.predict_proba)
In [50]:
explaination.show_in_notebook(show_all=False)

Task 4

In [54]:
explainer.explain_instance(X_test[31, :], model.predict_proba).show_in_notebook(show_all=False)
In [55]:
explainer.explain_instance(X_test[1, :], model.predict_proba).show_in_notebook(show_all=False)
In [56]:
explainer.explain_instance(X_test[42, :], model.predict_proba).show_in_notebook(show_all=False)

Comment

The explainations appear to be very stable. Each of the sampled observations had more or less the same decompositions - lead by Age and Country and with other variables with little to no impact.

Task 5

Model training

In [57]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
Out[57]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Finding differing observations

In [59]:
def compare_attribution(i):
    explainer.explain_instance(X_test[i, :], model.predict_proba).show_in_notebook(show_all=False)
    explainer.explain_instance(X_test[i, :], rf_model.predict_proba).show_in_notebook(show_all=False)
In [64]:
compare_attribution(3)
In [65]:
compare_attribution(19)
In [66]:
compare_attribution(43)

Comment

The Random Forest classifier (the bottom summary for each observation) seems to have more balanced attribution of Age and Country variables than the Catboost classifier (the top summary).